{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# \"data/nine_dreams/ninedreams.txt\" IS REQUIRED"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# SPECIFY FILE ENCODING TYPE IN PYTHON"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "UTF-8 ENCODING\n"
     ]
    }
   ],
   "source": [
    "# -*- coding: utf-8 -*-\n",
    "print (\"UTF-8 ENCODING\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# LOAD PACKAGES"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "PACKAGES LOADED\n"
     ]
    }
   ],
   "source": [
    "import chardet # https://github.com/chardet/chardet\n",
    "import glob\n",
    "import codecs\n",
    "import sys\n",
    "import os\n",
    "from TextLoader import *\n",
    "from Hangulpy3 import *\n",
    "print (\"PACKAGES LOADED\") "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# CONVERT TXT FILE TO UTF-8 ENCODING"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def conv_file(fromfile, tofile):\n",
    "    with open(fromfile, \"rb\") as f:\n",
    "        sample_text=f.read(10240)\n",
    "    pred = chardet.detect(sample_text)\n",
    "    if not pred['encoding'] in ('EUC-KR', 'UTF-8', 'CP949', 'UTF-16LE'):\n",
    "        print (\"WARNING! Unknown encoding! : %s = %s\") % (fromfile, pred['encoding'])\n",
    "        pred['encoding'] = \"CP949\" # 못찾으면 기본이 CP949\n",
    "        formfile = fromfile + \".unknown\"\n",
    "    elif pred['confidence'] < 0.9:\n",
    "        print (\"WARNING! Unsured encofing! : %s = %s / %s\")\n",
    "        % (fromfile, pred['confidence'], pred['encoding'])\n",
    "        formfile = fromfile + \".notsure\"\n",
    "    with codecs.open(fromfile, \"r\", encoding=pred['encoding'], errors=\"ignore\") as f:\n",
    "        with codecs.open(tofile, \"w+\", encoding=\"utf8\") as t:\n",
    "            all_text = f.read()\n",
    "            t.write(all_text)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# \"data/nine_dreams/ninedreams_utf8.txt\" IS GENERATED"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "UTF8-CONVERTING DONE\n",
      " [data/nine_dreams/ninedreams_utf8.txt] IS GENERATED\n"
     ]
    }
   ],
   "source": [
    "# SOURCE TXT FILE\n",
    "fromfile = \"data/nine_dreams/ninedreams.txt\"\n",
    "# TARGET TXT FILE\n",
    "tofile   = \"data/nine_dreams/ninedreams_utf8.txt\"\n",
    "conv_file(fromfile, tofile)\n",
    "print (\"UTF8-CONVERTING DONE\")\n",
    "print (\" [%s] IS GENERATED\" % (tofile))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# DECOMPOSE HANGUL (THIS PART IS IMPORTANT!)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "FUNCTION READY\n"
     ]
    }
   ],
   "source": [
    "def dump_file(filename):\n",
    "    result=u\"\" # <= UNICODE STRING \n",
    "    with codecs.open(filename,\"r\", encoding=\"UTF8\") as f:\n",
    "        for line in f.readlines():\n",
    "            line = tuple(line)\n",
    "            result = result + decompose_text(line)\n",
    "    return result\n",
    "print (\"FUNCTION READY\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# PYTHON 2 AND 3 COMPATIBILITY"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Parsing data/nine_dreams/ninedreams_utf8.txt done\n",
      "\n",
      "\n",
      "  ㅎㅏㄴᴥㄱㅜㄱᴥ ㄱㅜㄱᴥㅁㅜㄴᴥㅎㅏㄱᴥㅅㅏᴥㅅㅏㅇᴥ ㅇㅕㅇᴥ�\n"
     ]
    }
   ],
   "source": [
    "# dump_file() returns unicode text; slice the preview from it so the cut\n",
    "# falls between characters instead of inside a multi-byte UTF-8 sequence.\n",
    "decomposed_txt = dump_file(tofile)\n",
    "preview = decomposed_txt[:100]\n",
    "if sys.version_info.major == 2:\n",
    "    # Python 2: parsed_txt stays a UTF-8 encoded byte string.\n",
    "    parsed_txt = decomposed_txt.encode(\"utf8\")\n",
    "    preview = preview.encode(\"utf8\")\n",
    "else:\n",
    "    parsed_txt = decomposed_txt\n",
    "\n",
    "print (\"Parsing %s done\" % (tofile))\n",
    "# PRINT FIRST 100 CHARACTERS\n",
    "print (preview)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# \"data/nine_dreams/input.txt\" IS GENERATED"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saved to a txt file\n",
      "<closed file 'data/nine_dreams/input.txt', mode 'w' at 0x7f62ae9a58a0>\n"
     ]
    }
   ],
   "source": [
    "# Write UTF-8 bytes explicitly: parsed_txt is already a byte string on\n",
    "# Python 2, while on Python 3 it is text that a plain open(..., \"w\") would\n",
    "# encode with the platform default encoding instead of UTF-8.\n",
    "if isinstance(parsed_txt, bytes):\n",
    "    encoded_txt = parsed_txt\n",
    "else:\n",
    "    encoded_txt = parsed_txt.encode(\"utf8\")\n",
    "with open(\"data/nine_dreams/input.txt\", \"wb\") as text_file:\n",
    "    text_file.write(encoded_txt)\n",
    "print (\"Saved to a txt file\")\n",
    "print (text_file)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# COMPOSE HANGUL CHARACTER FROM PHONEME "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "오늘 ㅏㅇ사ㅣ ㅇㅕ나ㅕㅇㅛㄱㅇㄹㅆ닿ㄹㄴ시ㅏ항곤ㅛ젆 샘ㄶㅡ니.젆ㅓ앉ㅏ뤝ㅡㄴ짉 읐?ㄴ히ㅇ\n"
     ]
    }
   ],
   "source": [
    "data=[u'\\u3147', u'\\u3157', u'\\u1d25', u'\\u3134', u'\\u3161', u'\\u3139', u'\\u1d25'\n",
    "      , u' ', u'\\u314f', u'\\u3147', u'\\u3145', u'\\u314f', u'\\u1d25', u'\\u1d25'\n",
    "      , u'\\u3163', u'\\u1d25', u' ', u'\\u3147', u'\\u1d25', u'\\u3155', u'\\u1d25'\n",
    "      , u'\\u3134', u'\\u314f', u'\\u1d25', u'\\u3155', u'\\u3147', u'\\u1d25'\n",
    "      , u'\\u315b', u'\\u3131', u'\\u1d25', u'\\u3147', u'\\u3139', u'\\u3146'\n",
    "      , u'\\u1d25', u'\\u3137', u'\\u314f', u'\\u314e', u'\\u3139', u'\\u1d25'\n",
    "      , u'\\u3134', u'\\u1d25', u'\\u3145', u'\\u3163', u'\\u1d25', u'\\u1d25'\n",
    "      , u'\\u314f', u'\\u1d25', u'\\u314e', u'\\u314f', u'\\u3147', u'\\u3131'\n",
    "      , u'\\u3157', u'\\u3134', u'\\u1d25', u'\\u1d25', u'\\u315b', u'\\u1d25'\n",
    "      , u'\\u3148', u'\\u3153', u'\\u3136', u'\\u1d25', u' ', u'\\u3145', u'\\u3150'\n",
    "      , u'\\u3141', u'\\u3136', u'\\u3161', u'\\u3134', u'\\u3163', u'\\u1d25', u'.'\n",
    "      , u'\\u3148', u'\\u3153', u'\\u3134', u'\\u314e', u'\\u3153', u'\\u1d25', u'\\u1d25'\n",
    "      , u'\\u3147', u'\\u314f', u'\\u3134', u'\\u3148', u'\\u314f', u'\\u3139', u'\\u315d'\n",
    "      , u'\\u314c', u'\\u1d25', u'\\u3161', u'\\u3134', u'\\u3148', u'\\u3163', u'\\u313a'\n",
    "      , u'\\u1d25', u' ', u'\\u3147', u'\\u3161', u'\\u3146', u'\\u1d25', u'?', u'\\u3134'\n",
    "      , u'\\u1d25', u'\\u314e', u'\\u3163', u'\\u1d25', u'\\u3147', u'\\u3148', u'\\u314f'\n",
    "      ]\n",
    "# Call print with parentheses so this cell also runs on Python 3.\n",
    "print (automata(\"\".join(data)))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# GENERATE \"vocab.pkl\" and \"data.npy\" in \"data/nine_dreams/\" FROM \"data/nine_dreams/input.txt\" "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "loading preprocessed files\n"
     ]
    }
   ],
   "source": [
    "data_dir    = \"data/nine_dreams\"\n",
    "batch_size  = 50\n",
    "seq_length  = 50\n",
    "data_loader = TextLoader(data_dir, batch_size, seq_length)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# DATA_LOADER IS:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "type of 'data_loader' is <type 'dict'>, length is 76\n"
     ]
    }
   ],
   "source": [
    "# The values reported are those of data_loader.vocab, so name it in the message.\n",
    "print ( \"type of 'data_loader.vocab' is %s, length is %d\" \n",
    "       % (type(data_loader.vocab), len(data_loader.vocab)) )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# DATA_LOADER.VOCAB IS:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "data_loader.vocab looks like \n",
      "{u'_': 69, u'6': 59, u':': 57, u'\\n': 19, u'4': 67, u'5': 63, u'>': 75, u'!': 52, u' ': 1, u'\"': 28, u'\\u1d25': 0, u\"'\": 49, u')': 46, u'(': 45, u'-': 65, u',': 27, u'.': 24, u'\\u3131': 7, u'0': 73, u'\\u3133': 60, u'\\u3132': 29, u'\\u3135': 50, u'\\u3134': 4, u'\\u3137': 13, u'\\u3136': 44, u'\\u3139': 5, u'\\u3138': 32, u'\\u313b': 55, u'\\u313a': 48, u'\\u313c': 54, u'?': 41, u'3': 66, u'\\u3141': 12, u'\\u3140': 51, u'\\u3143': 47, u'\\u3142': 17, u'\\u3145': 10, u'\\u3144': 43, u'\\u3147': 2, u'\\u3146': 22, u'\\u3149': 40, u'\\u3148': 15, u'\\u314b': 42, u'\\u314a': 23, u'\\u314d': 31, u'\\u314c': 30, u'\\u314f': 3, u'\\u314e': 14, u'\\u3151': 34, u'\\u3150': 21, u'\\u3153': 11, u'\\u3152': 74, u'\\u3155': 18, u'\\u3154': 20, u'\\u3157': 9, u'\\u3156': 39, u'\\u3159': 53, u'\\u3158': 26, u'\\u315b': 38, u'\\u315a': 33, u'\\u315d': 36, u'\\u315c': 16, u'\\u315f': 35, u'\\u315e': 61, u'\\u3161': 8, u'\\u3160': 37, u'\\u3163': 6, u'\\u3162': 25, u'\\x1a': 72, u'9': 64, u'7': 71, u'2': 62, u'1': 58, u'\\u313f': 56, u'\\u313e': 70, u'8': 68} \n"
     ]
    }
   ],
   "source": [
    "print (\"data_loader.vocab looks like \\n%s \" % (data_loader.vocab,))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# DATA_LOADER.CHARS IS:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "type of 'data_loader.chars' is <type 'tuple'>, length is 76\n"
     ]
    }
   ],
   "source": [
    "print ( \"type of 'data_loader.chars' is %s, length is %d\" \n",
    "       % (type(data_loader.chars), len(data_loader.chars)) )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# CHARS CONVERTS INDEX -> CHAR"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "data_loader.chars looks like \n",
      "(u'\\u1d25', u' ', u'\\u3147', u'\\u314f', u'\\u3134', u'\\u3139', u'\\u3163', u'\\u3131', u'\\u3161', u'\\u3157', u'\\u3145', u'\\u3153', u'\\u3141', u'\\u3137', u'\\u314e', u'\\u3148', u'\\u315c', u'\\u3142', u'\\u3155', u'\\n', u'\\u3154', u'\\u3150', u'\\u3146', u'\\u314a', u'.', u'\\u3162', u'\\u3158', u',', u'\"', u'\\u3132', u'\\u314c', u'\\u314d', u'\\u3138', u'\\u315a', u'\\u3151', u'\\u315f', u'\\u315d', u'\\u3160', u'\\u315b', u'\\u3156', u'\\u3149', u'?', u'\\u314b', u'\\u3144', u'\\u3136', u'(', u')', u'\\u3143', u'\\u313a', u\"'\", u'\\u3135', u'\\u3140', u'!', u'\\u3159', u'\\u313c', u'\\u313b', u'\\u313f', u':', u'1', u'6', u'\\u3133', u'\\u315e', u'2', u'5', u'9', u'-', u'3', u'4', u'8', u'_', u'\\u313e', u'7', u'\\x1a', u'0', u'\\u3152', u'>') \n"
     ]
    }
   ],
   "source": [
    "print (\"data_loader.chars looks like \\n%s \" % (data_loader.chars,))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": false,
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[00]     (00)\n",
      "[01]     (01)\n",
      "[02]     (02)\n",
      "[03]   ㅏ (03)\n",
      "[04]     (04)\n",
      "[05]     (05)\n",
      "[06]   ㅣ (06)\n",
      "[07]     (07)\n",
      "[08]   ㅡ (08)\n",
      "[09]   ㅗ (09)\n",
      "[10]     (10)\n",
      "[11]   ㅓ (11)\n",
      "[12]     (12)\n",
      "[13]     (13)\n",
      "[14]     (14)\n",
      "[15]     (15)\n",
      "[16]   ㅜ (16)\n",
      "[17]     (17)\n",
      "[18]   ㅕ (18)\n",
      "[19]   \n",
      " (19)\n",
      "[20]   ㅔ (20)\n",
      "[21]   ㅐ (21)\n",
      "[22]     (22)\n",
      "[23]     (23)\n",
      "[24]   . (24)\n",
      "[25]   ㅢ (25)\n",
      "[26]   ㅘ (26)\n",
      "[27]   , (27)\n",
      "[28]   \" (28)\n",
      "[29]     (29)\n",
      "[30]     (30)\n",
      "[31]     (31)\n",
      "[32]     (32)\n",
      "[33]   ㅚ (33)\n",
      "[34]   ㅑ (34)\n",
      "[35]   ㅟ (35)\n",
      "[36]   ㅝ (36)\n",
      "[37]   ㅠ (37)\n",
      "[38]   ㅛ (38)\n",
      "[39]   ㅖ (39)\n",
      "[40]     (40)\n",
      "[41]   ? (41)\n",
      "[42]     (42)\n",
      "[43]   ㅄ (43)\n",
      "[44]   ㄶ (44)\n",
      "[45]   ( (45)\n",
      "[46]   ) (46)\n",
      "[47]     (47)\n",
      "[48]   ㄺ (48)\n",
      "[49]   ' (49)\n",
      "[50]   ㄵ (50)\n",
      "[51]   ㅀ (51)\n",
      "[52]   ! (52)\n",
      "[53]   ㅙ (53)\n",
      "[54]   ㄼ (54)\n",
      "[55]   ㄻ (55)\n",
      "[56]   ㄿ (56)\n",
      "[57]   : (57)\n",
      "[58]   1 (58)\n",
      "[59]   6 (59)\n",
      "[60]   ㄳ (60)\n",
      "[61]   ㅞ (61)\n",
      "[62]   2 (62)\n",
      "[63]   5 (63)\n",
      "[64]   9 (64)\n",
      "[65]   - (65)\n",
      "[66]   3 (66)\n",
      "[67]   4 (67)\n",
      "[68]   8 (68)\n",
      "[69]   _ (69)\n",
      "[70]   ㄾ (70)\n",
      "[71]   7 (71)\n",
      "[72]   \u001a (72)\n",
      "[73]   0 (73)\n",
      "[74]   ㅒ (74)\n",
      "[75]   > (75)\n"
     ]
    }
   ],
   "source": [
    "# For every entry of data_loader.chars, print its position, the character\n",
    "# passed through automata(), and the index data_loader.vocab stores for it.\n",
    "row_format = \"[%02d] %03s (%02d)\"\n",
    "for i, char in enumerate(data_loader.chars):\n",
    "    idx = data_loader.vocab[char]\n",
    "    composed = automata(\"\".join(char))\n",
    "    print (row_format % (i, composed, idx))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
